This is an academic explotory analysis done for the R class and the dataset used is the “Wine Quality”. It can be found on data.world or on kaggle.com. This dataset has separated csv’s, winequality-white.csv and winequality-red.csv, where the inputs are physicochemical variables and the the output is a sensory score variable given by sommeliers.
Paulo Cortez, University of Minho, Guimarães, Portugal, http://www3.dsi.uminho.pt/pcortez A. Cerdeira, F. Almeida, T. Matos and J. Reis, Viticulture Commission of the Vinho Verde Region(CVRVV), Porto, Portugal @2009
For more information, read [Cortez et al., 2009]. Input variables (based on physicochemical tests): 1 - fixed acidity 2 - volatile acidity 3 - citric acid 4 - residual sugar 5 - chlorides 6 - free sulfur dioxide 7 - total sulfur dioxide 8 - density 9 - pH 10 - sulphates 11 - alcohol Output variable (based on sensory data): 12 - quality (score between 0 and 10)
library(needs)
needs(tidyverse)
needs(data.table)
needs(gapminder)
needs(gridExtra)
needs(readxl)
needs(GGally)
needs(infer)
needs(nortest)
needs(car)
needs(broom)
needs(agricolae)
needs(dygraphs)
needs(D3plusR) # devtools::install_github('paulofelipe/D3plusR')
needs(highcharter)
needs(leaflet)
needs(collapsibleTree)
needs(magrittr)
needs(corrplot)
needs(plotly)
needs(lmtest)
needs(sandwich)
needs(DT)
# The import function will generate a parsing error for the column "total sulfur dioxide" because the rows 1296 ans 1297 are not integers, the reason why I'm using the col_types parameter.
base_red = read_csv("winequality-red.csv",col_types = "dddddddddddi")
base_white = read_csv("winequality-white.csv")
## Parsed with column specification:
## cols(
## `fixed acidity` = col_double(),
## `volatile acidity` = col_double(),
## `citric acid` = col_double(),
## `residual sugar` = col_double(),
## chlorides = col_double(),
## `free sulfur dioxide` = col_double(),
## `total sulfur dioxide` = col_double(),
## density = col_double(),
## pH = col_double(),
## sulphates = col_double(),
## alcohol = col_double(),
## quality = col_integer()
## )
#Adding the "type" columns before merge the datasets.
base_red = base_red %>% mutate(type = "red")
base_white = base_white %>% mutate(type = "white")
df_wine = rbind(base_red,base_white)
#Convert the new column to factor type
df_wine$type = as.factor(df_wine$type)
#Removing the white spaces
columns = names(df_wine)
columns = gsub(" ", "_", columns)
names(df_wine) = columns
#Checking the NA's
sum(is.na(df_wine) == TRUE)
## [1] 0
table(df_wine$quality,df_wine$type)
##
## red white
## 3 10 20
## 4 53 163
## 5 681 1457
## 6 638 2198
## 7 199 880
## 8 18 175
## 9 0 5
datatable(df_wine, options = list(pageLength = 5))
plot(df_wine)
hchart(cor(df_wine[1:12]),title="test")
summary(df_wine)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median :0.2900 Median :0.3100 Median : 3.000
## Mean : 7.215 Mean :0.3397 Mean :0.3186 Mean : 5.443
## 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.900 Max. :1.5800 Max. :1.6600 Max. :65.800
## chlorides free_sulfur_dioxide total_sulfur_dioxide
## Min. :0.00900 Min. : 1.00 Min. : 6.0
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0
## Median :0.04700 Median : 29.00 Median :118.0
## Mean :0.05603 Mean : 30.53 Mean :115.7
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0
## Max. :0.61100 Max. :289.00 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.00
## 1st Qu.:0.9923 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.50
## Median :0.9949 Median :3.210 Median :0.5100 Median :10.30
## Mean :0.9947 Mean :3.219 Mean :0.5313 Mean :10.49
## 3rd Qu.:0.9970 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:11.30
## Max. :1.0390 Max. :4.010 Max. :2.0000 Max. :14.90
## quality type
## Min. :3.000 red :1599
## 1st Qu.:5.000 white:4898
## Median :6.000
## Mean :5.818
## 3rd Qu.:6.000
## Max. :9.000
summary(df_wine %>%filter(type == 'red'))
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free_sulfur_dioxide total_sulfur_dioxide
## Min. :0.01200 Min. : 1.00 Min. : 6.00
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00
## Median :0.07900 Median :14.00 Median : 38.00
## Mean :0.08747 Mean :15.87 Mean : 46.47
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00
## Max. :0.61100 Max. :72.00 Max. :289.00
## density pH sulphates alcohol
## Min. :0.9901 Min. :2.740 Min. :0.3300 Min. : 8.40
## 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50
## Median :0.9968 Median :3.310 Median :0.6200 Median :10.20
## Mean :0.9967 Mean :3.311 Mean :0.6581 Mean :10.42
## 3rd Qu.:0.9978 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10
## Max. :1.0037 Max. :4.010 Max. :2.0000 Max. :14.90
## quality type
## Min. :3.000 red :1599
## 1st Qu.:5.000 white: 0
## Median :6.000
## Mean :5.636
## 3rd Qu.:6.000
## Max. :8.000
summary(df_wine %>%filter(type == 'white'))
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200
## Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391
## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800
## chlorides free_sulfur_dioxide total_sulfur_dioxide
## Min. :0.00900 Min. : 2.00 Min. : 9.0
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0
## Median :0.04300 Median : 34.00 Median :134.0
## Mean :0.04577 Mean : 35.31 Mean :138.4
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0
## Max. :0.34600 Max. :289.00 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.00
## 1st Qu.:0.9917 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50
## Median :0.9937 Median :3.180 Median :0.4700 Median :10.40
## Mean :0.9940 Mean :3.188 Mean :0.4898 Mean :10.51
## 3rd Qu.:0.9961 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40
## Max. :1.0390 Max. :3.820 Max. :1.0800 Max. :14.20
## quality type
## Min. :3.000 red : 0
## 1st Qu.:5.000 white:4898
## Median :6.000
## Mean :5.878
## 3rd Qu.:6.000
## Max. :9.000
#Mean and median
red_quality_mean = df_wine %>% select(quality,type) %>% filter(type=='red')
mean(red_quality_mean$quality)
## [1] 5.636023
median(red_quality_mean$quality)
## [1] 6
white_quality_mean = df_wine %>% select(quality,type) %>% filter(type=='white')
mean(white_quality_mean$quality)
## [1] 5.877909
median(white_quality_mean$quality)
## [1] 6
ggplot(red_quality_mean, aes(x = quality)) +
geom_histogram() +
ggtitle('Red wine score distribution')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(white_quality_mean, aes(x = quality)) +
geom_histogram() +
ggtitle('White wine score distribution')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplotly(ggplot(df_wine, aes(x=alcohol, y=volatile_acidity, color= as.factor(quality),alpha=0.5)) +
facet_wrap('type', ncol=2) +
geom_point() +
ggtitle('Alcohol X Volatile_acidity by type'))
ggplotly(ggplot(df_wine, aes(x=alcohol, y=volatile_acidity, color=type,alpha=0.5)) + geom_point() +
facet_wrap('quality', ncol=2)+
ggtitle('Alcohol X Volatile_acidity by quality'))
ggplotly(ggplot(df_wine, aes(x=alcohol, y=density, color= as.factor(quality),alpha=0.5)) +
facet_wrap('type', ncol=2) +
geom_point()+
ggtitle('Alcohol X Density by type'))
ggplotly(ggplot(df_wine, aes(x=alcohol, y=density, color=type,alpha=0.5)) + geom_point() +
facet_wrap('quality', ncol=2)+
ggtitle('Alcohol X Density by quality'))
ggplotly(ggplot(aes(x=alcohol,alpha=0.5),data =df_wine) +
geom_density(aes(fill = type))+
facet_wrap(~quality)+
ggtitle('Alcohol X quality density'))
ggplotly(ggplot(df_wine, aes(x=as.factor(quality), y=alcohol, fill=type,alpha=0.5)) +
geom_boxplot()+
ggtitle('alcohol X quality boxplot'))
ggplotly(ggplot(aes(x=volatile_acidity,alpha=0.5),data =df_wine) +
geom_density(aes(fill = type))+
facet_wrap(~quality)+
ggtitle('Volatile_acidity X quality density'))
ggplotly(ggplot(df_wine, aes(x=as.factor(quality), y=volatile_acidity, fill=type,alpha=0.5)) +
geom_boxplot()+
ggtitle('Volatile_acidity X quality boxplot'))
ggplotly(ggplot(aes(x=density,alpha=0.5),data =df_wine) +
geom_density(aes(fill = type))+
facet_wrap(~quality)+
ggtitle('Density X quality density'))
ggplotly(ggplot(df_wine, aes(x=as.factor(quality), y=density, fill=type,alpha=0.5)) +
geom_boxplot()+
ggtitle('Density X quality boxplot'))
t.test(quality ~ type, data = df_wine)
##
## Welch Two Sample t-test
##
## data: quality by type
## t = -10.149, df = 2950.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2886173 -0.1951564
## sample estimates:
## mean in group red mean in group white
## 5.636023 5.877909
corMatrix_dfwine <- cor(df_wine[1:12])
corrplot(corMatrix_dfwine, method = "ellipse")
#quality by alcohol
modelo1 <- lm(quality ~ alcohol, data = df_wine[1:12])
summary(modelo1)
##
## Call:
## lm(formula = quality ~ alcohol, data = df_wine[1:12])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5042 -0.4957 -0.0488 0.5043 3.2115
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.405269 0.085941 27.99 <2e-16 ***
## alcohol 0.325312 0.008139 39.97 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7824 on 6495 degrees of freedom
## Multiple R-squared: 0.1974, Adjusted R-squared: 0.1973
## F-statistic: 1598 on 1 and 6495 DF, p-value: < 2.2e-16
plot(modelo1)
#quality by alcohol + volatile_acidity
modelo2 <- lm(quality ~ alcohol + volatile_acidity, data = df_wine[1:12])
summary(modelo2)
##
## Call:
## lm(formula = quality ~ alcohol + volatile_acidity, data = df_wine[1:12])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4388 -0.4809 -0.0392 0.4843 3.1187
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.926559 0.085526 34.22 <2e-16 ***
## alcohol 0.318441 0.007824 40.70 <2e-16 ***
## volatile_acidity -1.322473 0.056680 -23.33 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7516 on 6494 degrees of freedom
## Multiple R-squared: 0.2595, Adjusted R-squared: 0.2593
## F-statistic: 1138 on 2 and 6494 DF, p-value: < 2.2e-16
plot(modelo2)
#quality by alcohol + volatile_acidity + density
modelo3 <- lm(quality ~ alcohol + volatile_acidity +density , data = df_wine[1:12])
summary(modelo3)
##
## Call:
## lm(formula = quality ~ alcohol + volatile_acidity + density,
## data = df_wine[1:12])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4295 -0.4846 -0.0343 0.4706 3.0262
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -35.11411 4.57246 -7.679 1.83e-14 ***
## alcohol 0.38253 0.01095 34.935 < 2e-16 ***
## volatile_acidity -1.49092 0.05991 -24.887 < 2e-16 ***
## density 37.62502 4.52171 8.321 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7477 on 6493 degrees of freedom
## Multiple R-squared: 0.2673, Adjusted R-squared: 0.267
## F-statistic: 789.6 on 3 and 6493 DF, p-value: < 2.2e-16
plot(modelo3)
anova(modelo1,modelo2)
## Analysis of Variance Table
##
## Model 1: quality ~ alcohol
## Model 2: quality ~ alcohol + volatile_acidity
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 6495 3975.7
## 2 6494 3668.2 1 307.51 544.39 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 2 is better than model 1 because the p value is less than 5%, so we accept H1.
anova(modelo2,modelo3)
## Analysis of Variance Table
##
## Model 1: quality ~ alcohol + volatile_acidity
## Model 2: quality ~ alcohol + volatile_acidity + density
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 6494 3668.2
## 2 6493 3629.5 1 38.704 69.238 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Model 3 is better than model 2, so we keep this model considering the features above.